import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import zscore
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus
import graphviz
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
1. Load the dataset
# Load the Parkinson's voice-measurement dataset from the working directory.
park = pd.read_csv("Data-Parkinsons.csv")
# First-pass inspection (bare expressions display their value in a notebook).
park.shape
park.head()
park.info()
park.isnull().values.any()  # True if any cell is missing
park.describe()             # per-column summary statistics
The following can be understood from initial data analysis
- There are 195 entries.
- There are no null values.
- There are 22 float,1 integer,1 object column.
- The shape of the data is 195 rows and 24 columns.
# Univariate analysis: summary statistics plus a distribution plot per column.
# NOTE: seaborn.distplot was deprecated in 0.11 and later removed;
# histplot(..., kde=True) is the supported equivalent. The 23 copy-pasted
# describe/plot pairs are collapsed into one loop over the original order.
univariate_cols = [
    "MDVP:Fo(Hz)", "MDVP:Fhi(Hz)", "MDVP:Flo(Hz)", "MDVP:Jitter(%)",
    "MDVP:Jitter(Abs)", "MDVP:RAP", "MDVP:PPQ", "Jitter:DDP", "MDVP:Shimmer",
    "MDVP:Shimmer(dB)", "Shimmer:APQ3", "Shimmer:APQ5", "MDVP:APQ",
    "Shimmer:DDA", "NHR", "HNR", "status", "RPDE", "DFA", "spread1",
    "spread2", "D2", "PPE",
]
for col in univariate_cols:
    print(park[col].describe())
    plt.figure()
    if col == "status":
        # "status" is the binary target (1 = Parkinson's), so count the classes
        # instead of plotting a continuous distribution.
        sns.countplot(x=col, data=park)
    else:
        sns.histplot(park[col], kde=True)
From the univariate analysis, the following can be inferred
- From the count plot, out of the 195 patients examined, around 140 are suspected to have Parkinson's disease.
- The columns D2,spread2,spread1 have a normal bell-shaped curve.
- The columns DFA, HNR are skewed towards the right.
- The rest of all columns are skewed towards left.
- The spread, mean, IQR, etc have been calculated.
# Bivariate analysis: pairwise scatter plots for all retained variables.
sns.pairplot(park[["MDVP:Fo(Hz)","MDVP:Fhi(Hz)","MDVP:Flo(Hz)","MDVP:Jitter(%)",
"MDVP:Jitter(Abs)","MDVP:RAP","MDVP:PPQ","Jitter:DDP","MDVP:Shimmer",
"MDVP:Shimmer(dB)","Shimmer:APQ3","Shimmer:APQ5","MDVP:APQ","Shimmer:DDA","NHR",
"HNR","status","RPDE","DFA","spread1","spread2","D2","PPE"]])
# Compute the correlation matrix once (the original computed it twice).
# select_dtypes guards against the non-numeric "name" column, on which
# DataFrame.corr raises in pandas 2.x.
core = park.select_dtypes(include="number").corr()
core
plt.figure(figsize = (21,15))
sns.heatmap(core,annot=True)
From the bivariate analysis, the following can be inferred
- Around 10 variables have a strong correlation.
- The pair plot of all the variables is plotted.
- The columns Jitter:DDP, DFA, MDVP:Fhi(Hz) and NHR are dropped because they are the least significant of the variables.
# Feature pruning: drop the identifier column and the four features judged
# least significant in the bivariate analysis.
park.drop(columns=["name", "Jitter:DDP", "DFA", "MDVP:Fhi(Hz)", "NHR"],
          inplace=True)
X = park.drop("status", axis=1)  # predictors
Y = park["status"]               # binary target (1 = Parkinson's)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
x_train.head()
# Treat zeros as missing values and replace them with the column mean.
rep_0 = SimpleImputer(missing_values=0, strategy="mean")
cols = x_train.columns
# BUG FIX: the imputer must be fitted on the training split only and then
# *applied* to the test split. The original called fit_transform on x_test,
# leaking test-set statistics and imputing the two splits inconsistently.
x_train = pd.DataFrame(rep_0.fit_transform(x_train), columns=cols)
x_test = pd.DataFrame(rep_0.transform(x_test), columns=cols)
x_train.head()
# Baseline model: logistic regression on the zero-imputed, unscaled features.
model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
# Inspect the learned coefficients alongside the intercept.
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)
model_score = model.score(x_test, y_test)  # mean accuracy on the test split
print(model_score)
# Compute the confusion matrix once and reuse it (the original computed it twice).
cm = confusion_matrix(y_test, y_predict)
print(cm)
sns.heatmap(cm, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, y_predict, labels=[1, 0]))
# Class balance check, then KNN on z-score-standardised features
# (KNN is distance-based, so feature scaling matters).
park.groupby(["status"]).count()
XScaled = X.apply(zscore)
XScaled.describe()
X_train, X_test, Y_train, Y_test = train_test_split(XScaled, Y, test_size=0.30, random_state=42)
NNH = KNeighborsClassifier(n_neighbors=5, weights='distance')
NNH.fit(X_train, Y_train)
predicted_labels = NNH.predict(X_test)
NNH.score(X_test, Y_test)
# Compute the confusion matrix once (the original recomputed it), and fix the
# "matirx" typo in the printed message.
a = confusion_matrix(Y_test, predicted_labels)
print("confusion matrix of KNN classifier = \n", a)
knn_m = pd.DataFrame(a, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(knn_m, annot=True)
print("Classification Report")
print(metrics.classification_report(Y_test, predicted_labels, labels=[1, 0]))
# Linear SVM on the unscaled features (fresh 70/30 split, random_state=42).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
svc_model = SVC(C=.1, kernel='linear', gamma=1)
svc_model.fit(x_train, y_train)
prediction = svc_model.predict(x_test)
print(svc_model.score(x_train, y_train))  # train accuracy
print(svc_model.score(x_test, y_test))    # test accuracy
# Compute the confusion matrix once; the original computed it twice and stored
# it in variables misleadingly named knnm/knn_m.
svc_cm = confusion_matrix(y_test, prediction)
print("Confusion Matrix:\n", svc_cm)
svc_m = pd.DataFrame(svc_cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(svc_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, prediction, labels=[1, 0]))
# Compare non-linear SVM kernels.
# BUG FIX: the original fitted the RBF model but overwrote it with the poly
# model before ever scoring it, so the RBF result was silently lost. Score
# every kernel in a single loop instead.
for kernel in ('rbf', 'poly', 'sigmoid'):
    svc_model = SVC(kernel=kernel)
    svc_model.fit(x_train, y_train)
    prediction = svc_model.predict(x_test)
    print(kernel, "train accuracy:", svc_model.score(x_train, y_train))
    print(kernel, "test accuracy:", svc_model.score(x_test, y_test))
# Decision tree (default Gini criterion, unpruned) plus a Graphviz rendering.
# FIX: the original list had ' MDVP:Flo(Hz)' with a stray leading space, which
# would mislabel that feature in the exported tree image.
feature_cols = ['MDVP:Fo(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)',
                'MDVP:RAP', 'MDVP:PPQ', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)',
                'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA',
                'HNR', 'RPDE', 'spread1', 'spread2', 'D2', 'PPE']
clf = DecisionTreeClassifier()
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Compute the confusion matrix once (the original computed it twice).
tree_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", tree_cm)
tree_m = pd.DataFrame(tree_cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(tree_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, y_pred, labels=[1, 0]))
# Export the fitted tree to PNG via Graphviz.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('park.png')
Image(graph.create_png())
# Refit a shallower, entropy-based tree (max depth 3) and render it to PNG.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Capture the Graphviz dot source in an in-memory buffer, then write the image.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
    feature_names=feature_cols,
    class_names=['0', '1'],
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('park1.png')
Image(graph.create_png())
# Random forest: 70 entropy-based trees.
rf = RandomForestClassifier(n_estimators=70, criterion="entropy")
rf = rf.fit(x_train, y_train)
predrf = rf.predict(x_test)
accrf = accuracy_score(y_test, predrf)  # test accuracy
print(accrf)
# Compute the confusion matrix once (the original computed it twice).
rf_cm = confusion_matrix(y_test, predrf)
print("Confusion Matrix:\n", rf_cm)
rf1_m = pd.DataFrame(rf_cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(rf1_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, predrf, labels=[1, 0]))
# AdaBoost: 100 boosted stumps (the default base estimator), learning rate 1.
abc = AdaBoostClassifier(n_estimators=100, learning_rate=1)
# Renamed from `model`, which silently shadowed the fitted LogisticRegression
# earlier in the file. (fit returns the fitted estimator itself.)
ada = abc.fit(x_train, y_train)
y_pred = ada.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Compute the confusion matrix once (the original computed it twice).
ada_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", ada_cm)
ab_m = pd.DataFrame(ada_cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(ab_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, y_pred, labels=[1, 0]))
# Bagging: 100 bootstrap models, each on 70% of the samples, with OOB scoring.
bag = BaggingClassifier(n_estimators=100, max_samples=.7, bootstrap=True, oob_score=True, random_state=22)
bag = bag.fit(x_train, y_train)
predBAG = bag.predict(x_test)
accBAG = accuracy_score(y_test, predBAG)
print(accBAG)
# oob_score=True was requested but the out-of-bag estimate was never reported.
print("Out-of-bag score:", bag.oob_score_)
# BUG FIX: the original rebound `bag` to the confusion matrix, discarding the
# fitted model; keep the matrix under its own name.
bag_cm = confusion_matrix(y_test, predBAG)
print("Confusion Matrix:\n", bag_cm)
bag_m = pd.DataFrame(bag_cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(bag_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, predBAG, labels=[1, 0]))
# Gradient boosting: 100 stages, learning rate 0.1.
grad = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
grad = grad.fit(x_train, y_train)
predgrad = grad.predict(x_test)
accgrad = accuracy_score(y_test, predgrad)
print(accgrad)
# BUG FIX: the original rebound `grad` to the confusion matrix, discarding the
# fitted model (and computed the matrix twice); keep a separate name.
grad_cm = confusion_matrix(y_test, predgrad)
print("Confusion Matrix:\n", grad_cm)
grad_m = pd.DataFrame(grad_cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(grad_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, predgrad, labels=[1, 0]))
# Stacking ensemble: KNN + random forest + linear SVM base learners, combined
# by a logistic-regression meta-classifier; 3-fold CV accuracy for each.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = SVC(C=.1, kernel='linear', gamma=1)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
print('3-fold cross validation:\n')
# BUG FIX: the third label read 'Naive Bayes' but clf3 is an SVM.
for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['KNN',
                       'Random Forest',
                       'SVM',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, Y,
                                             cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
From the above models, the following accuracies are obtained:
- Logistic Regression 78%
- KNN classifier 91%
- Support vector classifier 88%
- Decision Tree 86%
- Random Forest 90%
- Ada boosting classifier 86%
- Bagging Classifier 88%
- Gradient boosting Classifier 90%
- Meta-classifier(Stacking classifier) 87%. The meta classifier used was logistic regression.
From the above models, we get that
- The best model is the KNN classifier, with an accuracy of about 92%. The recall is also 92%, which is good. Thus, for the above reasons, the KNN classifier is selected.